################### PROJECT- JCR REPLICATION - data_processing_01 ##############################################
# This R file combines the raw dataset and creates the year-level dataset for shifts in territorial control. 
# Datasets generated from this codes include dataset1_oldschool.csv and dataset1_staggeredadoption.csv.
# They have all variables needed for main analysis for shifts in territorial control and corresponding tests in the Appendix
# Last updated: 16th, May 2025, by Wangyin Zhao
##############################################################################################################

##############################################################################################################
# Outline of Script
# A. Preparation
#    - Set working directory
#    - Load required packages
#
# B. Import Raw Datasets
#
# C. Process Each Variable
#    1. Territorial Control Data
#    2. The occurrence of rainfall-related natural disasters    
#    3. The territorial setting (spatial configuration of territorial control)   
#       3.1 The spatial lag of the territorial setting
#       3.2 The continuum of the spatial configuration of the territorial control
#    4. Night Light as a control (for Appendix E)
#         
# D. Merge each dataset and create the first dataset for the old school DID
#   
# E. Create the second dataset for the staggered adoption version
##############################################################################################################

#### A. Preparation ####
# Set working directory to Replication_Files folder
setwd("./Replication_Files")

# Import relevant packages
packages <- c("sf", "tidyverse", "plm", "exactextractr", "spdep", "zoo")
lapply(packages, library, character.only = TRUE)
########


#### B. Import raw datasets: ####
# The spatial data on the barangays from the Philippine Standard Geographic Code (PSGC) 
barangay <- st_read("./data/raw/Barangays/Barangays.shp")

# Barangay-level territorial control data:
control <- st_read("./data/raw/CNN_Brgy_Affectation_2011-2014.csv")

# The data on rainfall-related natural disasters:
# the percipitation data is from the IMERG: Integrated Multi-satellitE Retrievals for GPM. 
# .nc or tif. files for the precipitation for each month and grid can be downloaded from https://gpm.nasa.gov/data/imerg (better for batch downloading) 
# https://giovanni.gsfc.nasa.gov/giovanni/#service=TmAvMp&starttime=&endtime=&dataKeyword=IMERG%20Final (for a more access version)

# the data on the nightlight 
# the nightlight data is derived from the the VIIRS Day/Night Band (DNB) onboard the Joint Polar-orbiting Satellite System (JPSS) satellites 
# tif. files for the nightlight can be downloaded from https://eogdata.mines.edu/products/vnl/#daily

########

#### C. Process each variable: ####

##### 1. Barangay-level territorial control data: ####

# First, for the spatial analysis later, merge the territorial control data 
# with the geo-spatial data on the barangay in the Philippines.
control <- control %>% dplyr::select(Region_PSGC_Name, Region_PSGC_ID_mod, Province_PSGC_Name, 
                                     Province_PSGC_ID_mod, Muni_PSGC_Name, Muni_PSGC_ID_mod, 
                                     Brgy_PSGC_Name, Brgy_PSGC_ID, year, 
                                     Infl, LessInfl, Threat, aff, Infl2, aff2, Infl3) 

control <- merge(barangay, control, by.x='ADM_ID', by.y='Brgy_PSGC_ID', all.y=TRUE)

control <- control %>% pivot_wider(names_from = "year",
                                   values_from = c("Infl", "LessInfl", "Threat", 
                                                   "aff", "Infl2", "aff2", "Infl3")) # transfer it into barangay level for visualisation purpose and would transfer it back later

# Second, begin to construct the outcome of shifts in territorial control 
# based on the difference between the territorial control status in the given year and the previous year 
control_shift <- st_make_valid(control)

control_shift <- control_shift %>% dplyr::select(ADM4_PCODE, Infl2_2011, Infl2_2012, Infl2_2013, Infl2_2014) %>%
  pivot_longer(cols = starts_with("Infl2"), 
               names_to = "year", 
               values_to = "Influence2") %>%
  mutate(year = str_sub(year,start=-4)) %>% 
  st_drop_geometry() %>% 
  mutate(ID_year = paste (ADM4_PCODE, year, sep = "_"), 
         year = as.numeric(year),
         Influence2 = as.numeric(Influence2)) %>%
  dplyr::select(ADM4_PCODE, year, Influence2)
  # transfer the dataset back to the barangay-year level


control_shift <- control_shift %>% arrange(ADM4_PCODE, year) %>%
  group_by(ADM4_PCODE) %>%
  mutate(inf2_lag = dplyr::lag(Influence2),
         Tchange_inf2 = Influence2 - inf2_lag)
  # +1 refers to the territory change from the government side to the rebel side
  # -1 refers to the territory change from the rebel side to the government side

control_shift$Tchange_inf2[control_shift$Tchange_inf2 == -1] <- 1 # combine +1 and -1 into one, representing the occurence of the shift of territorial control
########

#### 2. The occurrence of rainfall-related natural disasters: ####
# As the sizes are too big, I have not included the raw data for all precipitation tif. files for each grid at each month
# the precipitation data is from the IMERG: Integrated Multi-satellite Retrievals for GPM. 
# .nc or tif. files for the precipitation for each month and grid can be downloaded from https://gpm.nasa.gov/data/imerg (better for batch downloading)
# https://giovanni.gsfc.nasa.gov/giovanni/#service=TmAvMp&starttime=&endtime=&dataKeyword=IMERG%20Final (for a more access version)

# I include the data on precipitation mean at the barangay and month level after processing and the codes for the processing all spatial files (.tif) 

# Processing codes: 

# rainfallfilelist <- list.files("data/phl/", full.names = T, pattern = "GIOVANNI-*") 
# rainfallfilesdf <- data.frame(filename=rainfallfilelist, yearmonth=stringr::str_split(basename(rainfallfilelist), pattern = "_") %>% 
#                    lapply(., function(x) x[4]) %>% unlist()) %>%
#                    mutate(yearmonth = str_sub(yearmonth,start=15, end=20))

# fulldf <- tibble()
# for(i in rainfallfilesdf$yearmonth){
  
# tt <- Sys.time()
# print(i)
  
# rainfallfilename <- rainfallfilesdf$filename[rainfallfilesdf$yearmonth==i] # name of corresponding NLT file at t-1
  
# rainfallfile <- terra::rast(rainfallfilename) # read NLT raster
  
# rainfallrast <- terra::crop(rainfallfile, phl) # crop nlt raster to gfo raster
# rainfallrast <- terra::mask(rainfallrast, phl)
  
# tmpres <- tibble(rainfallfile=rainfallfilename,
#                  ADM4_PCODE=phl$ADM4_PCODE,
#                  yearmonth=i,
#                  rainfallmean=exact_extract(rainfallrast, phl, "mean"))
  
# fulldf <- bind_rows(fulldf,tmpres)
# print(Sys.time()-tt)}

# Below codes are about coding the occurence of natural disasters at the barangay-year level step by step
# First, upload the processing data for the precipitation mean at the barangay-month level and calculate the z-value
fulldf <- read.csv("./data/analysis/rainfall_zonal stat for phl.csv")  %>%
  mutate(yearmonth = as.numeric(yearmonth)) %>%
  dplyr::select(ADM4_PCODE, yearmonth, rainfallmean) %>%
  mutate(
    year = as.integer(str_sub(yearmonth, 1, 4)),
    month = as.integer(str_sub(yearmonth, 5, 6))) # some data cleaning


output_f <- fulldf %>%
  group_by(ADM4_PCODE, year, month) %>%
  summarise(rainfallmean = mean(rainfallmean, na.rm = TRUE), .groups = "drop") %>%
  arrange(ADM4_PCODE, year, month) %>%
  group_by(ADM4_PCODE, month) %>%
  mutate(roll = rollmean(rainfallmean, k = 11, fill = NA, align = "center"),
         roll_sd = rollapply(rainfallmean, width = 11, FUN = sd, fill = NA, align = "center")) %>%
  # calculate the averaging mean and standard deviation
  ungroup() %>%
  filter(year > 2008, year < 2015) %>%
  mutate(z_value = (rainfallmean - roll) / roll_sd) %>%
  drop_na() # this code creates the z_value for the precipitation mean for each barangay at each month 
            

# Second, calculate the occurrence of rainfall-related natural disasters at the barangay-year level based on the output_f
nd_year <- output_f %>% 
  mutate(z_value = as.numeric(z_value),
         ND_1.5_bar = if_else(z_value >= 1.5, 1, 0),
         ND_1.6_bar = if_else(z_value >= 1.6, 1, 0),
         ND_1.7_bar = if_else(z_value >= 1.7, 1, 0),
         ND_1.8_bar = if_else(z_value >= 1.8, 1, 0),
         ND_1.9_bar = if_else(z_value >= 1.9, 1, 0),
         ND_2_bar = if_else(z_value >= 2, 1, 0)) %>% 
  rename(z_value_bar = `z_value`) %>%
  na.omit() %>% 
  # code the occurence of rainfall-related natural disasters at the month level
  group_by(ADM4_PCODE, year) %>%
  summarise(ND_1.5_bar = if_else(mean(ND_1.5_bar) > 0, 1, 0), 
            ND_1.6_bar = if_else(mean(ND_1.6_bar) > 0, 1, 0), 
            ND_1.7_bar = if_else(mean(ND_1.7_bar) > 0, 1, 0), 
            ND_1.8_bar = if_else(mean(ND_1.8_bar) > 0, 1, 0),
            ND_1.9_bar = if_else(mean(ND_1.9_bar) > 0, 1, 0),
            ND_2_bar = if_else(mean(ND_2_bar) > 0, 1, 0)) %>%
  mutate(year = as.numeric(year))
  # transfer it into the year level 

# the final dataset on the occurrence of rainfall-related natural disasters at the barangay-year level ready for merging is nd_year

########

#### 3. The territorial setting (spatial configuration of territorial control) ####
# First, use the processed data on the territorial control - "control", generated in "1.",
# , create a neighbor list using queen contiguity,
# and convert the neighbor list to a row-standardized spatial weights list.
control_territorysetting <- st_make_valid(control) %>% mutate(Infl2_2011 = as.numeric(Infl2_2011),
                                             Infl2_2012 = as.numeric(Infl2_2012),
                                             Infl2_2013 = as.numeric(Infl2_2013),
                                             Infl2_2014 = as.numeric(Infl2_2014))
control_nb <- spdep::poly2nb(control_territorysetting, queen = TRUE)
control_nblist <- nb2listw(control_nb,style = "W", zero.policy = TRUE) 


# Second, I calculate the spatial lag: the weighted average of neighboring values, 
# in this case: the status of the territorial control, 
# and then I code the territorial setting based on the spatial lag for each year.
# 2011
control_territorysetting$Territory_inf2_2011 <- lag.listw(x = control_nblist, var = control_territorysetting$Infl2_2011, zero.policy = TRUE, NAOK = TRUE) 
control_territorysetting$Territory_inf2_2011[control_territorysetting$Territory_inf2_2011 > 0 & control_territorysetting$Territory_inf2_2011 < 1] <- "Mixed"
control_territorysetting$Territory_inf2_2011[control_territorysetting$Territory_inf2_2011 == 0 & control_territorysetting$Infl2_2011 == 0] <- "Homogenous"
control_territorysetting$Territory_inf2_2011[control_territorysetting$Territory_inf2_2011 == 1 & control_territorysetting$Infl2_2011 == 1] <- "Homogenous"
control_territorysetting$Territory_inf2_2011[control_territorysetting$Territory_inf2_2011 == 0 & control_territorysetting$Infl2_2011 == 1] <- "Enclave"
control_territorysetting$Territory_inf2_2011[control_territorysetting$Territory_inf2_2011 == 1 & control_territorysetting$Infl2_2011 == 0] <- "Enclave"

# 2012
control_territorysetting$Territory_inf2_2012 <- lag.listw(x = control_nblist, var = control_territorysetting$Infl2_2012, zero.policy = TRUE, NAOK = TRUE) 
control_territorysetting$Territory_inf2_2012[control_territorysetting$Territory_inf2_2012 > 0 & control_territorysetting$Territory_inf2_2012 < 1] <- "Mixed"
control_territorysetting$Territory_inf2_2012[control_territorysetting$Territory_inf2_2012 == 0 & control_territorysetting$Infl2_2012 == 0] <- "Homogenous"
control_territorysetting$Territory_inf2_2012[control_territorysetting$Territory_inf2_2012 == 1 & control_territorysetting$Infl2_2012 == 1] <- "Homogenous"
control_territorysetting$Territory_inf2_2012[control_territorysetting$Territory_inf2_2012 == 0 & control_territorysetting$Infl2_2012 == 1] <- "Enclave"
control_territorysetting$Territory_inf2_2012[control_territorysetting$Territory_inf2_2012 == 1 & control_territorysetting$Infl2_2012 == 0] <- "Enclave"

# 2013
control_territorysetting$Territory_inf2_2013 <- lag.listw(x = control_nblist, var = control_territorysetting$Infl2_2013, zero.policy = TRUE, NAOK = TRUE) 
control_territorysetting$Territory_inf2_2013[control_territorysetting$Territory_inf2_2013 > 0 & control_territorysetting$Territory_inf2_2013 < 1] <- "Mixed"
control_territorysetting$Territory_inf2_2013[control_territorysetting$Territory_inf2_2013 == 0 & control_territorysetting$Infl2_2013 == 0] <- "Homogenous"
control_territorysetting$Territory_inf2_2013[control_territorysetting$Territory_inf2_2013 == 1 & control_territorysetting$Infl2_2013 == 1] <- "Homogenous"
control_territorysetting$Territory_inf2_2013[control_territorysetting$Territory_inf2_2013 == 0 & control_territorysetting$Infl2_2013 == 1] <- "Enclave"
control_territorysetting$Territory_inf2_2013[control_territorysetting$Territory_inf2_2013 == 1 & control_territorysetting$Infl2_2013 == 0] <- "Enclave"

# 2014
control_territorysetting$Territory_inf2_2014 <- lag.listw(x = control_nblist, var = control_territorysetting$Infl2_2014, zero.policy = TRUE, NAOK = TRUE) 
control_territorysetting$Territory_inf2_2014[control_territorysetting$Territory_inf2_2014 > 0 & control_territorysetting$Territory_inf2_2014 < 1] <- "Mixed"
control_territorysetting$Territory_inf2_2014[control_territorysetting$Territory_inf2_2014 == 0 & control_territorysetting$Infl2_2014 == 0] <- "Homogenous"
control_territorysetting$Territory_inf2_2014[control_territorysetting$Territory_inf2_2014 == 1 & control_territorysetting$Infl2_2014 == 1] <- "Homogenous"
control_territorysetting$Territory_inf2_2014[control_territorysetting$Territory_inf2_2014 == 0 & control_territorysetting$Infl2_2014 == 1] <- "Enclave"
control_territorysetting$Territory_inf2_2014[control_territorysetting$Territory_inf2_2014 == 1 & control_territorysetting$Infl2_2014 == 0] <- "Enclave"


# Third, convert the barangay level data on the territorial setting to the barangay-year level
# and reserve the data on the territorial control and convert it into barangay-year level, too
rebelcontrol <- control_territorysetting %>% dplyr::select(ADM4_PCODE, Infl2_2011, Infl2_2012, Infl2_2013, Infl2_2014) %>%
  pivot_longer(cols = starts_with("Infl2"), 
               names_to = "year", 
               values_to = "Influence2") %>%
  mutate(year = str_sub(year,start=-4)) %>% 
  mutate(ID_year = paste(ADM4_PCODE, year, sep = "_")) %>% 
  mutate(year = as.numeric(year))


territorialsetting <- control_territorysetting %>% dplyr::select(ADM4_PCODE, Territory_inf2_2011, Territory_inf2_2012, Territory_inf2_2013, Territory_inf2_2014) %>%
  pivot_longer(cols = starts_with("Territory_inf2"), 
               names_to = "year", 
               values_to = "T_inf2") %>%
  mutate(year = str_sub(year,start=-4)) %>% 
  st_drop_geometry() %>% 
  mutate(ID_year = paste(ADM4_PCODE, year, sep = "_")) %>% 
  mutate(year = as.numeric(year))


Territorialsettingf <- rebelcontrol %>% left_join(territorialsetting) # the dataset on the territorial setting and the territorial control for each barangay

Territorialsettingf$Enclave_inf2[Territorialsettingf$T_inf2 == "Enclave"] <- 1
Territorialsettingf$Enclave_inf2[Territorialsettingf$T_inf2 == "Homogenous" | Territorialsettingf$T_inf2 == "Mixed"] <- 0
Territorialsettingf$Mixed_inf2[Territorialsettingf$T_inf2 == "Mixed"] <- 1
Territorialsettingf$Mixed_inf2[Territorialsettingf$T_inf2 == "Homogenous" | Territorialsettingf$T_inf2 == "Enclave"] <- 0

########

#### 3.1 The spatial lag of the territorial setting: ####

year <- c(2011, 2012, 2013, 2014)

# enclave setting 
enclavelag <- tibble()
for(i in year){
  tt <- Sys.time()
  print(i)
  phl_each_year <- Territorialsettingf %>% filter(year==i)
  phl_f_nb <- spdep::poly2nb(phl_each_year, queen = TRUE)
  phl_f_nblist <- nb2listw(phl_f_nb,style = "W", zero.policy = TRUE) 
  
  tm <- tibble(ADM4_PCODE=phl_each_year$ADM4_PCODE,
               
               time= i,
               
               Enclave_inf2_nb=lag.listw(x = phl_f_nblist, var = phl_each_year$Enclave_inf2, zero.policy = TRUE, NAOK = TRUE))
  
  tt <- Sys.time()
  print(i)
  
  enclavelag <- bind_rows(enclavelag,tm)
  
  print(Sys.time()-tt)
}

enclavelag$Enclave_inf2_nb_d[enclavelag$Enclave_inf2_nb > 0] <- 1 
enclavelag$Enclave_inf2_nb_d[enclavelag$Enclave_inf2_nb == 0] <- 0


# mix setting
mixlag <- tibble()
for(i in year){
  tt <- Sys.time()
  print(i)
  phl_each_year <- Territorialsettingf %>% filter(year==i)
  phl_f_nb <- spdep::poly2nb(phl_each_year, queen = TRUE)
  phl_f_nblist <- nb2listw(phl_f_nb,style = "W", zero.policy = TRUE) 
  
  tm <- tibble(ADM4_PCODE=phl_each_year$ADM4_PCODE,
               
               time= i,
               
               Mixed_inf2_nb=lag.listw(x = phl_f_nblist, var = phl_each_year$Mixed_inf2, zero.policy = TRUE, NAOK = TRUE))
  
  tt <- Sys.time()
  print(i)
  
  mixlag <- bind_rows(mixlag,tm)
  
  print(Sys.time()-tt)
}

mixlag$Mixed_inf2_nb_d[mixlag$Mixed_inf2_nb > 0] <- 1 
mixlag$Mixed_inf2_nb_d[mixlag$Mixed_inf2_nb == 0] <- 0


enclavelag <- enclavelag %>% mutate(ID_year = paste(ADM4_PCODE, year, sep = "_"))
mixlag <- mixlag %>% mutate(ID_year = paste(ADM4_PCODE, year, sep = "_"))


Territorialsettingf_withspatiallag <- Territorialsettingf %>% mutate(ID_year = paste(ADM4_PCODE, year, sep = "_")) %>%
  left_join(enclavelag) %>%
  left_join(mixlag) %>% st_drop_geometry() # the final dataset on the territorial setting for each barangay in each month 

########

#### 3.2 The continuum of the spatial configuration of the territorial control: (for the Appendix Q) ####
# Instead of coding different spatial configuration as a categorical variable of territorial setting,
# Code it as a continuum 

# Repeat what I have done in 3: 
# First, plot the weighted matrix and calculate the spatial lag of the village
control_continuum <- control %>% st_make_valid() %>% mutate(Infl2_2011 = as.numeric(Infl2_2011),
                                                                Infl2_2012 = as.numeric(Infl2_2012),
                                                                Infl2_2013 = as.numeric(Infl2_2013),
                                                                Infl2_2014 = as.numeric(Infl2_2014))
control_continuum_nb <- spdep::poly2nb(control_continuum, queen = TRUE)
control_continuum_nblist <- nb2listw(control_continuum_nb,style = "W", zero.policy = TRUE) 

# Calculate the spatial lag
# 2011
control_continuum$Territory_inf2_2011 <- lag.listw(x = control_continuum_nblist, var = control_continuum$Infl2_2011, zero.policy = TRUE, NAOK = TRUE) 

# 2012
control_continuum$Territory_inf2_2012 <- lag.listw(x = control_continuum_nblist, var = control_continuum$Infl2_2012, zero.policy = TRUE, NAOK = TRUE) 

# 2013
control_continuum$Territory_inf2_2013 <- lag.listw(x = control_continuum_nblist, var = control_continuum$Infl2_2013, zero.policy = TRUE, NAOK = TRUE) 

# 2014
control_continuum$Territory_inf2_2014 <- lag.listw(x = control_continuum_nblist, var = control_continuum$Infl2_2014, zero.policy = TRUE, NAOK = TRUE) 



control_continuum_1 <- control_continuum %>% dplyr::select(ADM4_PCODE, Infl2_2011, Infl2_2012, Infl2_2013, Infl2_2014) %>%
  pivot_longer(cols = starts_with("Infl2"), 
               names_to = "year", 
               values_to = "Influence2") %>%
  mutate(year = str_sub(year,start=-4)) # include the data on the rebel control



control_continuum_2 <- control_continuum %>% dplyr::select(ADM4_PCODE, Territory_inf2_2011, Territory_inf2_2012, Territory_inf2_2013, Territory_inf2_2014) %>%
  pivot_longer(cols = starts_with("Territory_inf2"), 
               names_to = "year", 
               values_to = "T_inf2") %>%
  mutate(year = str_sub(year,start=-4)) %>% 
  st_drop_geometry() %>% 
  mutate(ID_year = paste(ADM4_PCODE, year, sep = "_")) 

control_continuum_f <- control_continuum_1 %>% mutate(ID_year = paste(ADM4_PCODE, year, sep = "_")) %>%
  left_join(control_continuum_2) %>%
  mutate(Influence2 = as.numeric(Influence2)) %>%
  mutate(year = as.numeric(year)) %>%
  mutate(T_inf2_0.1 = if_else(T_inf2 >= 0.1, 1, 0),
         T_inf2_0.2 = if_else(T_inf2 >= 0.2, 1, 0),
         T_inf2_0.3 = if_else(T_inf2 >= 0.3, 1, 0),
         T_inf2_0.4 = if_else(T_inf2 >= 0.4, 1, 0),
         T_inf2_0.5 = if_else(T_inf2 >= 0.5, 1, 0),
         T_inf2_0.6 = if_else(T_inf2 >= 0.6, 1, 0),
         T_inf2_0.7 = if_else(T_inf2 >= 0.7, 1, 0),
         T_inf2_0.8 = if_else(T_inf2 >= 0.8, 1, 0),
         T_inf2_0.9 = if_else(T_inf2 >= 0.9, 1, 0)) %>%
  # code different levels of intensity (proportions)
  rename("T_inf2_prop" = "T_inf2") %>% st_drop_geometry()# change the name to avoid the confusion with the T_inf2 variable for the territorial setting



#########

#### 4. Night Light as a control (for Appendix E): ####
# The nightlight data is derived from the the VIIRS Day/Night Band (DNB) onboard the Joint Polar-orbiting Satellite System (JPSS) satellites 
# tif. files for the nightlight can be downloaded from https://eogdata.mines.edu/products/vnl/#daily

# below are codes for processing. After downloading the annual level nightlight data, you can replace n2012, n2013, and n2014 with your downloaded ones
# nl$n2012 <- exact_extract(n2012, barangay, "mean")
# nl$n2013 <- exact_extract(n2013, barangay, "mean")
# nl$n2014 <- exact_extract(n2014, barangay, "mean")

# nl_long <- nl %>%
#  select(-geometry) %>%  # Remove geometry if present
#  pivot_longer(cols = starts_with("n20"), names_to = "year", values_to = "value") %>%
#  mutate(year = gsub("n", "", year))  # Remove 'n' to keep only the year

# nl_long <- nl_long %>% dplyr::select(ADM4_PCODE, year, value) %>% 
#  rename(NightLight = value) %>% 
#  st_drop_geometry() 

# the data after processing is uplaoded here as nl_long
nl_long <- st_read("./data/analysis/nightlight.csv") %>% mutate(NightLight = as.numeric(NightLight),
                                                                year = as.numeric(year)) 

########

#### D. Merge each dataset and create the first dataset for the old school DID ####
# 1. Merge each dataset together 
# now the data for merging is well prepared, and they include:
dataset1 <- control_shift %>% 
  # the data on the shift of territorial control
  left_join(nd_year) %>% 
  # the data on the occurrence of rainfall related natural disasters
  left_join(Territorialsettingf_withspatiallag) %>% 
  # the data on the territorial setting used to measure the spatial configuration of territorial control, and its spatial lag
  left_join(control_continuum_f) %>% st_drop_geometry() %>%
  #the data on the proportion of surrounding territories under enemy control used to measure spatial configuration of territorial control
  left_join(nl_long)

# 2. Create the time lag for the territorial setting
dataset1 <- dataset1 %>% arrange(ADM4_PCODE, year) %>%
  group_by(ADM4_PCODE) %>%
  mutate(Enclave_inf2 = dplyr::lag(Enclave_inf2, n=1)) %>%
  mutate(Mixed_inf2 = dplyr::lag(Mixed_inf2, n=1)) %>%
  mutate(Enclave_inf2_nb_d = dplyr::lag(Enclave_inf2_nb_d, n=1)) %>%
  mutate(Mixed_inf2_nb_d = dplyr::lag(Mixed_inf2_nb_d, n=1)) 

# 3. add the lat and lon for Conley standard error in Appendix I
latlon <- barangay %>% dplyr::select(ADM3_PCODE, ADM4_PCODE, LATITUDE, LONGITUDE) %>% 
  st_make_valid() %>% filter(ADM4_PCODE != "PH097332") %>% st_drop_geometry()

dataset1 <- latlon %>% right_join(dataset1) 


# 4. prepare the variable for the proportion of the territories controlled by the enemy 
dataset1 <- dataset1 %>%
  mutate(T_inf2_prop = as.numeric(T_inf2_prop)) %>%
  # convert the class for the calculation purpose
  mutate(Enemy_around = if_else(Enclave_inf2 > 0 | Mixed_inf2 > 0, 1, 0),
         # it represents whther there exist any surrounding territories controlled by the enemy
         Prob.rebel = T_inf2_prop,
         Prob.gov = 1 - T_inf2_prop,
         Enemy_0.1 = if_else((Influence2 == 1 & Prob.gov >= 0.1) | (Influence2 == 0 & Prob.rebel >= 0.1), 1, 0),
         # whether there exist more than 10% of surrounding territories controlled by the enemy
         Enemy_0.2 = if_else((Influence2 == 1 & Prob.gov >= 0.2) | (Influence2 == 0 & Prob.rebel >= 0.2), 1, 0),
         # 20% - the below represents 20 - 100%
         Enemy_0.3 = if_else((Influence2 == 1 & Prob.gov >= 0.3) | (Influence2 == 0 & Prob.rebel >= 0.3), 1, 0),
         Enemy_0.4 = if_else((Influence2 == 1 & Prob.gov >= 0.4) | (Influence2 == 0 & Prob.rebel >= 0.4), 1, 0),
         Enemy_0.5 = if_else((Influence2 == 1 & Prob.gov >= 0.5) | (Influence2 == 0 & Prob.rebel >= 0.5), 1, 0),
         Enemy_0.6 = if_else((Influence2 == 1 & Prob.gov >= 0.6) | (Influence2 == 0 & Prob.rebel >= 0.6), 1, 0),
         Enemy_0.7 = if_else((Influence2 == 1 & Prob.gov >= 0.7) | (Influence2 == 0 & Prob.rebel >= 0.7), 1, 0),
         Enemy_0.8 = if_else((Influence2 == 1 & Prob.gov >= 0.8) | (Influence2 == 0 & Prob.rebel >= 0.8), 1, 0),
         Enemy_0.9 = if_else((Influence2 == 1 & Prob.gov >= 0.9) | (Influence2 == 0 & Prob.rebel >= 0.9), 1, 0),
         Enemy_1 = if_else((Influence2 == 1 & Prob.gov >= 1) | (Influence2 == 0 & Prob.rebel >= 1), 1, 0)) %>%
  arrange(ADM4_PCODE, year) %>% group_by(ADM4_PCODE) %>% 
  mutate(Enemy_0.1 = dplyr::lag(Enemy_0.1, n=1),
         Enemy_0.2 = dplyr::lag(Enemy_0.2, n=1),
         Enemy_0.3 = dplyr::lag(Enemy_0.3, n=1),
         Enemy_0.4 = dplyr::lag(Enemy_0.4, n=1),
         Enemy_0.5 = dplyr::lag(Enemy_0.5, n=1),
         Enemy_0.6 = dplyr::lag(Enemy_0.6, n=1),
         Enemy_0.7 = dplyr::lag(Enemy_0.7, n=1),
         Enemy_0.8 = dplyr::lag(Enemy_0.8, n=1),
         Enemy_0.9 = dplyr::lag(Enemy_0.9, n=1),
         Enemy_1 = dplyr::lag(Enemy_1, n=1)) 
# the time lag is created to ensure that variable represents the correct year, as the NPA intellegent assessment collects the data at the end of each year.

                    
dataset1 <- dataset1 %>% dplyr::select(ADM3_PCODE, ADM4_PCODE, LATITUDE, LONGITUDE, year, ID_year, 
                                       Influence2, Tchange_inf2, ND_1.5_bar, ND_1.6_bar, ND_1.7_bar, ND_1.8_bar, ND_1.9_bar, ND_2_bar,
                                       T_inf2, Enclave_inf2, Mixed_inf2, Enclave_inf2_nb_d, Mixed_inf2_nb_d, 
                                       Enemy_around, Enemy_0.1, Enemy_0.2, Enemy_0.3, Enemy_0.4, Enemy_0.5, Enemy_0.6, Enemy_0.7, Enemy_0.8, Enemy_0.9,
                                       NightLight)

write_csv(dataset1, "./data/analysis/dataset1_oldschool.csv", na = "NA") 
# the is the first year-level dataset for DID old school analysis, and can be found in the file - replication_files/data/analysis;
# it focuses on shifts in territorial control as the outcome

########

#### E. Create the second dataset for the staggered adoption design ####
# Make the staggered adoption 
# First, Convert 'time' to a proper Date object 
dataset1 <- dataset1 %>%
  mutate(year = as.numeric(year),
         numeric_time = (year - 2012))

# Second, code the first hit 
dataset1 <- dataset1 %>%
  group_by(ADM4_PCODE) %>%
  mutate(
    first_disaster_year = min(numeric_time[ND_1.6_bar == 1], na.rm = TRUE),  
    # First disaster
    treated = ifelse(numeric_time >= first_disaster_year, 1, 0)) %>%
    # Treated after first disaster 
  ungroup()

# Third, identify second disaster time
dataset1 <- dataset1 %>%
  group_by(ADM4_PCODE) %>%
  mutate(
    second_disaster_time = nth(numeric_time[ND_1.6_bar == 1], 2, default = NA)  # Second disaster time
  ) %>%
  ungroup()

#### Calculate the spatial lag for the treat for the robustness test (Appendix C) ####
map <- barangay %>% st_make_valid() %>% 
  dplyr::select(ADM4_PCODE) %>% filter(ADM4_PCODE != "PH097332") 


fulldf_1 <-  map %>% right_join(dataset1) #HERE THERE ARE SOMETHING WRONG WITH PH097332, WHICH CORRESPONDS TO THREE ROWS

year <- c(2011, 2012, 2013, 2014)
df_treated <- tibble()
for(i in year){
  tt <- Sys.time()
  print(i)
  fulldf_1_each_time <- fulldf_1 %>% filter(year == i )
  fulldf_1_nb <- spdep::poly2nb(fulldf_1_each_time, queen = TRUE)
  fulldf_1_nblist <- nb2listw(fulldf_1_nb,style = "W", zero.policy = TRUE) 
  
  tmpres <- tibble(ADM4_PCODE=fulldf_1_each_time$ADM4_PCODE,
                   
                   time= i,
                   
                   ND_nb=lag.listw(x = fulldf_1_nblist, var = fulldf_1_each_time$treated, zero.policy = TRUE, NAOK = TRUE))
  
  tt <- Sys.time()
  print(i)
  
  df_treated <- bind_rows(df_treated,tmpres)
  
  print(Sys.time()-tt)
}

table(df_treated$ND_nb)

df_treated$ND_nb_d[df_treated$ND_nb > 0] <- 1
df_treated$ND_nb_d[df_treated$ND_nb == 0] <- 0
df_treated <- df_treated %>% rename(NDtreated_nb = ND_nb) %>% 
  rename(NDtreated_nb_d = ND_nb_d)
df_treated_1 <- df_treated[df_treated$ADM4_PCODE != "PH097332",]


df_treated_1 <- df_treated_1 %>% mutate(ID_year = paste(ADM4_PCODE, time, sep = "_")) 


fulldf_1 <- fulldf_1[fulldf_1$ADM4_PCODE != "PH097332",]

df_f <- fulldf_1 %>% mutate(ID_year = paste(ADM4_PCODE, year, sep = "_")) %>%
  left_join(df_treated_1) 

df_f <- df_f %>% filter(year > 2010 & year < 2015) %>% dplyr::select(ADM4_PCODE, year, NDtreated_nb, NDtreated_nb_d) %>% st_drop_geometry() %>%
  mutate(year = as.numeric(year))


dataset1 <- dataset1 %>% left_join(df_f)


# Fourth, Exclude months after the second disaster
dataset1_exlcude_second <- dataset1 %>%
  filter(is.na(second_disaster_time) | numeric_time <= second_disaster_time)

# Fifth, I exclude disaster-affected barangays in 2010 and 2011. 
test <- output_f %>% filter(year == 2010 | year == 2011) %>% 
  filter(z_value >= 1.6) %>% 
  # output_f is the data on the precipitation deviation generated in 2)
  group_by(ADM4_PCODE) %>% 
  summarise(n=n()) # calculate affected barangays in 2010 and 2011

dataset1_filtered_2010_2011 <- dataset1_exlcude_second %>%
  filter(!ADM4_PCODE %in% test$ADM4_PCODE)

dataset1_filtered_2010_2011 <- dataset1_filtered_2010_2011 %>% dplyr::select(ADM3_PCODE, ADM4_PCODE, LATITUDE, LONGITUDE, year, ID_year,
                                                                             Influence2, Tchange_inf2, 
                                                                             treated, NDtreated_nb_d, Enclave_inf2, Enclave_inf2_nb_d, Mixed_inf2, Mixed_inf2_nb_d,
                                                                             Enemy_around, Enemy_0.1, Enemy_0.2, Enemy_0.3, Enemy_0.4, Enemy_0.5, Enemy_0.6, Enemy_0.7, Enemy_0.8, Enemy_0.9,
                                                                             NightLight)

write_csv(dataset1_filtered_2010_2011, "./data/analysis/dataset1_staggeredadoption.csv", na = "NA") 
# the is the second  year-level dataset for staggered adoption analysis, and can be found in the file - replication_files/data/analysis;
# it focuses on shifts in territorial control as the outcome
########




